In [1]:
import copy
from pyspark.sql import SQLContext
import json
sqlContext = SQLContext(sc)
df = sqlContext.jsonFile("./spark_tutorial_article.json")
gf = df.map(lambda x : (x[2],x[5],x[12]))
print type(gf)
#spark.read.json(sc.wholeTextFiles('./spark_tutorial_article.json').values())
In [2]:
#sc.textFile("./spark_tutorial_article.json").map(json.loads).take(1)[0][u'author']
In [3]:
## getContent: for an input article, extract its word list via jieba.cut()
def getContent(x):
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(x)
    text = soup.getText().replace('\n','').replace('\r','').replace(' ','').replace('\t','')
    import jieba
    r = list()
    for term in jieba.cut(text):
        if len(term) > 1 and checkword(term): r.append(term)
    return r

def checkword(x):
    # keep only tokens made entirely of CJK characters
    return all(u'\u4e00' <= c <= u'\u9fff' for c in x)
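A quick sanity check of checkword on made-up strings (illustrative only; getContent itself additionally needs bs4 and jieba installed):
In [ ]:
print checkword(u'蝦球')   # True: every character is in the CJK range
print checkword(u'abc2')   # False: Latin letters and digits are rejected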
In [4]:
text_token = gf.map(lambda x: (x[0], getContent(x[1]), x[2]))
In [150]:
#text_token.first()
#text_token.first()
#text_token.count()
In [6]:
def cal_tf(tokens):
    # term frequency: raw count of each token divided by document length
    d = {}
    for word in tokens:
        if not word in d:
            d[word] = 1
        else:
            d[word] += 1
    for word in d:
        d[word] = float(d[word]) / len(tokens)
    return d
text_token_tf = text_token.map(lambda x: cal_tf(x[1]))
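A minimal sketch of what cal_tf returns, using a hand-made token list rather than a real article:
In [ ]:
print cal_tf([u'蝦球', u'蝦球', u'泰式', u'辣味'])
# {u'蝦球': 0.5, u'泰式': 0.25, u'辣味': 0.25}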
In [7]:
#check text_token_tf
#text_token_tf.first()
In [8]:
def cal_idf(docs):
    # inverse document frequency: N / df (raw ratio, no log smoothing)
    N = docs.count()
    uniqueTokens = docs.map(lambda x: list(set(x[1])))
    token_sum_tuples = uniqueTokens.flatMap(lambda x: x).map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)
    return token_sum_tuples.map(lambda x: (x[0], float(N) / x[1]))
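Note that cal_idf uses the raw ratio N/df rather than log(N/df). A toy check on a two-document RDD (sc is the notebook's SparkContext; only the token list at index 1 matters here):
In [ ]:
toy_docs = sc.parallelize([(0, [u'蝦球', u'泰式']), (1, [u'泰式', u'辣味'])])
print cal_idf(toy_docs).collect()
# [(u'蝦球', 2.0), (u'泰式', 1.0), (u'辣味', 2.0)] -- order may vary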
In [9]:
def TFIDF(tokens, idfs):
    # weight each token's tf by its idf; assumes every token appears in idfs
    tfidf_Dict = cal_tf(tokens)
    for tk in tfidf_Dict:
        tfidf_Dict[tk] = tfidf_Dict[tk] * idfs[tk]
    return tfidf_Dict
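TFIDF indexes idfs directly, so a token missing from the idf dictionary raises a KeyError; that is safe here because doc_c is built from the same corpus. A toy check with a hand-made idf dict:
In [ ]:
toy_idfs = {u'蝦球': 2.0, u'泰式': 1.0}
print TFIDF([u'蝦球', u'泰式', u'泰式'], toy_idfs)
# {u'蝦球': 1/3 * 2.0 ≈ 0.667, u'泰式': 2/3 * 1.0 ≈ 0.667}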
In [10]:
doc_idfs = cal_idf(text_token)
doc_c = doc_idfs.collectAsMap() #my idf dict
text_tfidf = TFIDF(text_token.collect()[0][1], doc_c)
print text_token.collect()[0][0]
In [11]:
#check text_tfidf
#text_tfidf
#text_token.collect()[0][1]
In [12]:
import math
def dotprod(a, b):
    # dot product over the keys the two weight dicts share
    dotsum = 0
    for tk in a:
        if tk in b:
            dotsum += a[tk] * b[tk]
    return dotsum

def norm(a):
    return math.sqrt(dotprod(a, a))

def cossim(a, b):
    return dotprod(a, b) / (norm(a) * norm(b))
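A small worked example of the similarity helpers on hand-made weight dicts:
In [ ]:
u = {u'蝦球': 1.0, u'泰式': 2.0}
v = {u'泰式': 2.0, u'辣味': 3.0}
print dotprod(u, v)   # 4.0: only the shared key u'泰式' contributes
print cossim(u, v)    # 4.0 / (sqrt(5.0) * sqrt(13.0)) ≈ 0.496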
In [13]:
def cosineSimilarity(tokens1, tokens2, idfsDictionary):
    # compare two token lists under a shared idf dictionary
    w1 = TFIDF(tokens1, idfsDictionary)
    w2 = TFIDF(tokens2, idfsDictionary)
    return cossim(w1, w2)
In [14]:
def showTopWord(link):
    # look up the article by URL, weight its tokens, and show the heaviest terms
    tokens = text_token.filter(lambda x: x[2] == link).collect()[0][1]
    tokens_weights = TFIDF(tokens, doc_c)
    print type(tokens_weights)
    tokens_weights_sorted = sorted(tokens_weights, key=tokens_weights.get, reverse=True)
    for index in range(0, 9):
        print tokens_weights_sorted[index], tokens_weights[tokens_weights_sorted[index]]
    print tokens_weights_sorted[:14]
    return tokens_weights_sorted[:14]
In [15]:
link = u'http://lovecc6.pixnet.net/blog/post/73513867'
#showTopWord(link)
In [16]:
urls = text_token.map(lambda x : x[2])
#top_word_list = text_token.map(lambda x : showTopWord(x[2]))
#top_word_list = [showTopWord(i) for i in urls]
#top_word_list = urls.map(lambda x: showTopWord(x))
#top_word_list
In [17]:
#top_word_list
In [18]:
query_input = [u'蝦球', u'辣味', u'泰式']
def check_in(query, text):
    # count how many query terms appear in the token list
    count = 0
    for q in query:
        if q in text:
            count += 1
    return count

def query_points(query):
    query_points_table = text_token.map(lambda x: check_in(query, x[1]))
    return query_points_table
In [19]:
query_pts = query_points(query_input).collect()
len(query_pts)
Out[19]:
In [20]:
def term_weights(tokens):
    # raw term counts (unnormalized term frequency)
    d = {}
    for word in tokens:
        if not word in d:
            d[word] = 1
        else:
            d[word] += 1
    return d
In [21]:
def term_points(query, point_dict):
    # sum the raw counts of the query terms that appear in the document
    points = 0
    for i in query:
        if i in point_dict:
            points += point_dict[i]
    return points

tf_list = text_token.map(lambda x: term_weights(x[1])).collect()
In [22]:
term_pts = [term_points(query_input, i) for i in tf_list]
len(term_pts)
Out[22]:
In [23]:
def doc_points(term_weight_pts, query_pts):
    # document score = (summed term counts) * (number of distinct query terms present)
    # tw_dict = text_token.map(lambda x: term_weights(x[1])).collect()
    # doc_point = text_token.map(lambda x: ((term_points(query_input, tw_dict)) * (check_in(query_input, x[1])), x[2]))
    doc_point = [i * j for i, j in zip(term_weight_pts, query_pts)]
    return doc_point
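So a document's score is (summed raw counts of the query terms) times (number of distinct query terms it contains), which rewards documents that cover more of the query. A toy trace with two hand-made documents:
In [ ]:
toy_tokens = [[u'蝦球', u'蝦球', u'泰式'], [u'辣味']]
toy_query = [u'蝦球', u'泰式']
t_pts = [term_points(toy_query, term_weights(d)) for d in toy_tokens]  # [3, 0]
q_pts = [check_in(toy_query, d) for d in toy_tokens]                   # [2, 0]
print doc_points(t_pts, q_pts)                                         # [6, 0]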
In [24]:
url_list = text_token.map(lambda x : (x[2]))
total_pts = zip(doc_points(term_pts, query_pts) , url_list.collect())
In [25]:
#print type(total_pts)
total_pts_sort = sorted(total_pts, reverse=True)
#total_pts_sort
In [26]:
total_pts_sort[:10]
Out[26]:
In [27]:
def exact_match(restaurant, title):
    # only trust exact substring matches for names of 3+ characters
    if len(restaurant) < 3:
        return 0
    if restaurant in title:
        return 1
    else:
        return 0
In [176]:
def title_checkword(x):
    # keep CJK characters, ASCII letters, and digits
    return all((u'\u4e00' <= c <= u'\u9fff') or ('A' <= c <= 'Z') or ('a' <= c <= 'z') or
               ('0' <= c <= '9') for c in x)

def cut_title(title):
    import jieba
    r = list()
    for term in jieba.cut(title):
        if title_checkword(term): r.append(term)
    return r
In [181]:
cut_title(article_info.first()[0])
Out[181]:
In [151]:
article_info = df.map(lambda x : (x[11],x[12]))
print type(article_info)
article_info.first()
In [182]:
def bio_wordset(words):
    # join each adjacent pair of tokens into a bigram string
    try:
        biogram_str = map(lambda x, y: x + y, words[:-1], words[1:])
        return biogram_str
    except:
        return []
In [183]:
bio_test_1 = cut_title(article_info.first()[0])
print type(bio_test_1)
bio_test_2 = bio_wordset(bio_test_1)
print bio_test_2
In [184]:
import pandas as pd
res = sc.textFile('./restaurant.csv').map(lambda line: line.split(',')).map(
    lambda line: line[2]).collect()[1:]
ex_digit = sc.textFile('./exchange_word.csv').map(lambda line: line.split(',')).map(
    lambda line: (line[0], line[1])).filter(lambda x: x[0].isdigit()).collect()
ex_word = sc.textFile('./exchange_word.csv').map(lambda line: line.split(',')).map(
    lambda line: (line[0], line[1])).filter(lambda x: not x[0].isdigit()).collect()
res_data = list(set(res))
In [185]:
print ex_word
print ex_digit
In [157]:
def check_word(title_str):
    for w in ex_word:
        if w[0] in title_str:
            return True
    return False
In [96]:
def change_word(title_str):
    # replace the first matching exchange word with its canonical form
    for w in ex_word:
        if w[0] in title_str:
            return title_str.replace(w[0], w[1])
    return title_str
In [170]:
# expects a title that has already been segmented into a token list
def change_list(input_list):
    result = ''.join([change_word(i) for i in input_list])
    return result
In [188]:
change_list_test = u'32比較公雞燒肉'
change_list_test = change_list(cut_title(change_list_test))
print change_list_test
In [186]:
def change_num(input_str):
    # rewrite Arabic digits as Chinese numerals, inserting u'十' between consecutive digits
    output_str = ''
    for index, i in enumerate(input_str):
        count = 0
        for j in ex_digit:
            if i == j[0]:
                count += 1
                output_str += j[1]
                try:
                    if input_str[index+1].isdigit():
                        output_str += u'十'
                except IndexError:
                    continue
        if count == 0:
            output_str += i
    return output_str
In [187]:
change_num_test = u'32比較公雞迴轉'
change_num_test = change_num(change_num_test)
print change_num_test
In [189]:
#for i in res_data:
# print i
In [190]:
def creat_long(short_str):
    # keep only tokens longer than one character
    result = [p for p in short_str if len(p) > 1]
    return result
In [191]:
creat_long(cut_title(article_info.first()[0]))
Out[191]:
In [192]:
def separate_eng(input_str):
    # pull out tokens that start with an ASCII letter (e.g. English names)
    result = list()
    for i in input_str:
        if i.isalpha() and (('A' <= i[0] <= 'Z') or ('a' <= i[0] <= 'z')):
            result.append(i)
    return result
In [193]:
aa = separate_eng(cut_title(article_info.first()[0]))
#print aa
#print article_info.collect()
In [194]:
def long_term_compare(title, name):
    count = 0
    for i in title:
        if i in name:
            count += 1
    return count
In [195]:
def bio_long_term_compare(title, name):
    # same membership count as long_term_compare, kept separate for bigram inputs
    count = 0
    for i in title:
        if i in name:
            count += 1
    return count
In [196]:
def term_compare(title, name):
    term_count = 0
    for i in title:
        if i in name:
            term_count += 1
    return term_count
In [197]:
def bio_term_compare(title, name):
    # same membership count as term_compare, kept separate for bigram inputs
    term_count = 0
    for i in title:
        if i in name:
            term_count += 1
    return term_count
In [198]:
q1 = ['豆腐','好臭','誰的','好香','滴油']
q2 = ['豆腐','好臭','誰的']
term_compare(q1, q2)
Out[198]:
In [200]:
# Score the six features and record the score breakdown; in order: short-token match,
# short-bigram match, long-token match, long-bigram match, English match, exact match
def calculate_pts(short_uni, short_bio, long_uni, long_bio, eng_name, exact, pts_record):
    a = [short_uni, short_bio, long_uni, long_bio, eng_name, exact]
    pts_record.append(a)
    return (1*short_uni) + (2*short_bio) + (1*long_uni) + (4*long_bio) + (4*eng_name) + (100000*exact)
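A minimal trace of the scorer: bigram and English matches are weighted above unigram matches, and an exact name match (weight 100000) dominates everything else. The feature values below are made up:
In [ ]:
toy_record = list()
print calculate_pts(2, 1, 1, 0, 0, 0, toy_record)  # 1*2 + 2*1 + 1*1 = 5
print toy_record                                   # [[2, 1, 1, 0, 0, 0]]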
In [135]:
def predict_restaurant(title):
    # break the title into the token sets we need
    title_token = cut_title(change_num(title))
    biogram_title_token = bio_wordset(title_token)
    long_title_token = creat_long(title_token)
    long_biogram_title_token = bio_wordset(long_title_token)
    Eng_title_token = separate_eng(title_token)
    pts_list = list()
    pts_record = list()
    exact_pts = 0
    for i in res_data:
        # apply the same processing to each restaurant name
        exact_pts = exact_match(i, title)
        name_token = cut_title(change_num(i))
        biogram_name_token = bio_wordset(name_token)
        long_name_token = creat_long(name_token)
        long_biogram_name_token = bio_wordset(long_name_token)
        Eng_name_token = separate_eng(name_token)
        # compute the six features
        short_uni = term_compare(title_token, name_token)
        short_bio = bio_term_compare(biogram_title_token, biogram_name_token)
        long_uni = long_term_compare(long_title_token, long_name_token)
        long_bio = bio_long_term_compare(long_biogram_title_token, long_biogram_name_token)
        eng_name = term_compare(Eng_title_token, Eng_name_token)
        # feed the features into the scoring function
        pts_list.append(calculate_pts(
            short_uni, short_bio, long_uni, long_bio, eng_name, exact_pts, pts_record))
    # sort and keep the top three candidates
    rank_list = zip(pts_list, res_data, pts_record)
    rank_list_sorted = sorted(rank_list, reverse=True)
    return rank_list_sorted[:3]
In [201]:
last_test = article_info.map(lambda x: x[0]).map(lambda x : predict_restaurant(x))
In [59]:
rr = [u'【小宅食記】喜來登kitchen 12早餐吃到飽|美好一天的開始:台北市中正區', u'雲軒西餐廳 La Rotisserie - 君品酒店',
      u'非凡大探索-吃到飽-喜來登十二廚下午茶', u'十二廚自助餐廳 - 台北喜來登大飯店', u'槿韓食堂 -韓式料理吃到飽(1F)']
In [117]:
def test_2(inin):
    # dump every token view we build for each input title
    for i in inin:
        print '======================'
        title_token = cut_title(i)
        for j in title_token:
            print j
        print('---------------')
        biogram_title_token = bio_wordset(title_token)
        for j in biogram_title_token:
            print j
        print('---------------')
        long_title_token = creat_long(title_token)
        for j in long_title_token:
            print j
        print('---------------')
        long_biogram_title_token = bio_wordset(long_title_token)
        for j in long_biogram_title_token:
            print j
        print('---------------')
        Eng_title_token = separate_eng(title_token)
        for j in Eng_title_token:
            print j
        print('---------------')
In [202]:
#test_2(rr)
In [203]:
predict_result = last_test.collect()
In [205]:
#predict_result
In [3]:
#for index,i in enumerate(qqq):
# print index
# for j in i:
# print j[1]
In [209]:
def evaluation(title, prediction):
    # print each title next to its top predicted restaurant and score breakdown
    count = 0
    for a, b in zip(title, prediction):
        print count
        print ('===========================')
        print a
        print b[0][1]
        print b[0][2]
        print ('===========================')
        count += 1
In [210]:
evaluation(article_info.map(lambda x: x[0]).collect(), predict_result)
In [ ]: